Project : Top Movie Streaming¶

Analyzing Main Streaming Services

Skills Involved :¶

DATA ANALYSIS

DATA VISUALISING

DATA CLEANSING

PYTHON

Starting :¶

Importing Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
import plotly.offline as pyo
pyo.init_notebook_mode()
import plotly.express as px
import random
%matplotlib inline

Coding Ground¶

Data Cleansing¶

In [2]:
df=pd.read_csv('moviestreams.csv')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
# df.to_csv('moviestreams.csv',index=False)
df
Out[2]:
Title Year Age IMDb Rotten Tomatoes Netflix Hulu Prime Video Disney+ Type Directors Genres Country Language Runtime
0 Inception 2010 13+ 8.8 87% 1 0 0 0 0 Christopher Nolan Action,Adventure,Sci-Fi,Thriller United States,United Kingdom English,Japanese,French 148.0
1 The Matrix 1999 18+ 8.7 87% 1 0 0 0 0 Lana Wachowski,Lilly Wachowski Action,Sci-Fi United States English 136.0
2 Avengers: Infinity War 2018 13+ 8.5 84% 1 0 0 0 0 Anthony Russo,Joe Russo Action,Adventure,Sci-Fi United States English 149.0
3 Back to the Future 1985 7+ 8.5 96% 1 0 0 0 0 Robert Zemeckis Adventure,Comedy,Sci-Fi United States English 116.0
4 The Good, the Bad and the Ugly 1966 18+ 8.8 97% 1 0 1 0 0 Sergio Leone Western Italy,Spain,West Germany Italian 161.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16739 The Ghosts of Buxley Hall 1980 NaN 6.2 NaN 0 0 0 1 0 Bruce Bilson Comedy,Family,Fantasy,Horror United States English 120.0
16740 The Poof Point 2001 7+ 4.7 NaN 0 0 0 1 0 Neal Israel Comedy,Family,Sci-Fi United States English 90.0
16741 Sharks of Lost Island 2013 NaN 5.7 NaN 0 0 0 1 0 Neil Gelinas Documentary United States English NaN
16742 Man Among Cheetahs 2017 NaN 6.6 NaN 0 0 0 1 0 Richard Slater-Jones Documentary United States English NaN
16743 In Beaver Valley 1950 NaN NaN NaN 0 0 0 1 0 James Algar Documentary,Short,Family United States English 32.0

16744 rows × 15 columns

In [3]:
colors=['brown','red','orange','salmon','purple','blue','green','lightblue','lightsalmon']
df.shape
Out[3]:
(16744, 15)
In [4]:
df.count()
Out[4]:
Title              16744
Year               16744
Age                 7354
IMDb               16173
Rotten Tomatoes     5158
Netflix            16744
Hulu               16744
Prime Video        16744
Disney+            16744
Type               16744
Directors          16018
Genres             16469
Country            16309
Language           16145
Runtime            16152
dtype: int64
In [5]:
cols = df.columns.to_list()
cols
Out[5]:
['Title',
 'Year',
 'Age',
 'IMDb',
 'Rotten Tomatoes',
 'Netflix',
 'Hulu',
 'Prime Video',
 'Disney+',
 'Type',
 'Directors',
 'Genres',
 'Country',
 'Language',
 'Runtime']
In [6]:
df.isna().sum()
Out[6]:
Title                  0
Year                   0
Age                 9390
IMDb                 571
Rotten Tomatoes    11586
Netflix                0
Hulu                   0
Prime Video            0
Disney+                0
Type                   0
Directors            726
Genres               275
Country              435
Language             599
Runtime              592
dtype: int64

REMOVING '+' IN AGE :

In [7]:
#Age={'18+':18,'7+':7,'13+':13,'all':0,'16':16}
#df.Age=df.Age.map(Age)
#df

REMOVING '%' IN ROTTEN TOMATOES

In [8]:
df['Rotten Tomatoes'] = df['Rotten Tomatoes'].str.replace('%', '').astype(float)
df
Out[8]:
Title Year Age IMDb Rotten Tomatoes Netflix Hulu Prime Video Disney+ Type Directors Genres Country Language Runtime
0 Inception 2010 13+ 8.8 87.0 1 0 0 0 0 Christopher Nolan Action,Adventure,Sci-Fi,Thriller United States,United Kingdom English,Japanese,French 148.0
1 The Matrix 1999 18+ 8.7 87.0 1 0 0 0 0 Lana Wachowski,Lilly Wachowski Action,Sci-Fi United States English 136.0
2 Avengers: Infinity War 2018 13+ 8.5 84.0 1 0 0 0 0 Anthony Russo,Joe Russo Action,Adventure,Sci-Fi United States English 149.0
3 Back to the Future 1985 7+ 8.5 96.0 1 0 0 0 0 Robert Zemeckis Adventure,Comedy,Sci-Fi United States English 116.0
4 The Good, the Bad and the Ugly 1966 18+ 8.8 97.0 1 0 1 0 0 Sergio Leone Western Italy,Spain,West Germany Italian 161.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16739 The Ghosts of Buxley Hall 1980 NaN 6.2 NaN 0 0 0 1 0 Bruce Bilson Comedy,Family,Fantasy,Horror United States English 120.0
16740 The Poof Point 2001 7+ 4.7 NaN 0 0 0 1 0 Neal Israel Comedy,Family,Sci-Fi United States English 90.0
16741 Sharks of Lost Island 2013 NaN 5.7 NaN 0 0 0 1 0 Neil Gelinas Documentary United States English NaN
16742 Man Among Cheetahs 2017 NaN 6.6 NaN 0 0 0 1 0 Richard Slater-Jones Documentary United States English NaN
16743 In Beaver Valley 1950 NaN NaN NaN 0 0 0 1 0 James Algar Documentary,Short,Family United States English 32.0

16744 rows × 15 columns

Top 10 languages in Streaming Movies¶

In [50]:
language=pd.DataFrame(dict(df.Language.value_counts().head(10)).items(),columns=['Languages','No. Of Movies'])
fig=px.bar(language,
           x=language.Languages,
           y=language['No. Of Movies'],
           title='Top 10 languages in Streaming Movies',
           text=language['No. Of Movies'],
           height=600)
fig.update_traces(texttemplate='%{text:.4s}',textposition='outside')
fig.show()
#HTML(fig.to_html())
In [10]:
fig=px.pie(language,names=language.Languages,values=language['No. Of Movies'],
           title='Top 10 languages in Streaming Services',
           height=600)
#fig.update_traces(textposition='outside')
fig.show()
#HTML(fig.to_html())

Number of Movies in specific age group in All services¶

In [11]:
#Age Graph Functions
def making_ageGraph(df:pd.DataFrame,stream:str,height:float=600):
    color=random.choice(colors)
    df={'Age':df.Age.value_counts().index,'Counts':df.Age.value_counts()}
    fig = px.bar(df, 
             x='Age',
             y='Counts',
             title=f"Number of Movies in specific age group in {stream} service",
             text='Counts', 
             height=height)
    fig.update_traces(marker_color=color,texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
    fig.show()
    #return HTML(fig.to_html())
In [12]:
making_ageGraph(df,'All')

Number of Movies in specific age group in Netflix¶

In [13]:
netflix_df=df[df['Netflix']==1]
making_ageGraph(netflix_df,'Netflix')

Number of Movies in specific age group in Amazon Prime Video¶

In [14]:
prime_df=df[df['Prime Video']==1]
making_ageGraph(prime_df,'Amazon Prime Video')

Number of Movies in specific age group in Disney+¶

In [15]:
Disney_df=df[df['Disney+']==1]
making_ageGraph(Disney_df,'Disney+')

Number of Movies in specific age group in Hulu¶

In [16]:
Hulu_df=df[df['Hulu']==1]
making_ageGraph(Hulu_df,'Hulu')

Rotten Tomatoes Score¶

A Tomatometer score is calculated for a movie or TV show after it receives at least five reviews. When at least 60% of reviews for a movie or TV show are positive, a red tomato is displayed to indicate its Fresh status.

Rotten Tomatoes gives films a score out of 100 based on the averaged reviews of professional film critics. If a film gets a rating of 60 or more it gets a 'fresh' red tomato on the site. Less than 60 and it gets a rotten tomato.

In [17]:
fig = px.bar(df, 
             x=df['Rotten Tomatoes'].value_counts().index, 
             y=df['Rotten Tomatoes'].value_counts(),
             title="Overall Rotten Tomato Ratings",
             text=df['Rotten Tomatoes'].value_counts(), 
             height=600)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
fig.show()
#HTML(fig.to_html())
In [18]:
rt_scores = pd.DataFrame({'Streaming Service': ["Prime Video", "Hulu","Disney+","NetFlix"],
                        'Rotten Tomato Score' : [netflix_df['Rotten Tomatoes'].value_counts().iloc[0], 
                                                prime_df['Rotten Tomatoes'].value_counts().iloc[0],
                                                Disney_df['Rotten Tomatoes'].value_counts().iloc[0],
                                                Hulu_df['Rotten Tomatoes'].value_counts().iloc[0]],
                         'Highest Value':[netflix_df['Rotten Tomatoes'].value_counts().index[0], 
                                        prime_df['Rotten Tomatoes'].value_counts().index[0],
                                        Disney_df['Rotten Tomatoes'].value_counts().index[0],
                                        Hulu_df['Rotten Tomatoes'].value_counts().index[0]]})
rt_scores.head()
Out[18]:
Streaming Service Rotten Tomato Score Highest Value
0 Prime Video 130 100.0
1 Hulu 257 100.0
2 Disney+ 19 100.0
3 NetFlix 18 100.0
In [19]:
rt_scores.sort_values(ascending=False, by="Rotten Tomato Score").plot(kind='bar', x='Streaming Service', y='Rotten Tomato Score', 
                            color='Violet', 
                            title="Streaming Service with 100% Rotten Tomato Score")
plt.show()
In [20]:
sorted_rt_score=rt_scores.sort_values(ascending=False, by="Rotten Tomato Score")
fig = px.bar(sorted_rt_score, 
             x=sorted_rt_score['Streaming Service'], 
             y=sorted_rt_score['Rotten Tomato Score'],
             title="Rotten Tomato Ratings For Each Services",
             text=sorted_rt_score['Rotten Tomato Score'], 
             height=600)
fig.update_traces(marker_color='purple',texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
fig.show()
#HTML(fig.to_html())

IMDB Ratings¶

In [21]:
#IMDb Graph Functions
def making_IMDbGraph(df:pd.DataFrame,stream:str,height:float=600):
    color=random.choice(colors)
    df=pd.DataFrame(dict(df['IMDb'].value_counts()).items(),columns=['IMDb','Counts'])
    fig = px.bar(df, 
             x=df['IMDb'],
             y=df['Counts'],
             title=f"Overall IMDb Ratings For {stream} Service",
             text=df['IMDb'].value_counts(), 
             height=height)
    fig.update_traces(marker_color=color,texttemplate='%{text:.2s}', textposition='outside') #for the text to be outside.
    fig.show()
    #return HTML(fig.to_html())
For All Service¶
In [22]:
making_IMDbGraph(df,'All')
For Netflix¶
In [23]:
making_IMDbGraph(netflix_df,'Netflix')
For Amazon Prime¶
In [24]:
making_IMDbGraph(prime_df,'Amazon Prime')
For Disney+¶
In [25]:
making_IMDbGraph(Disney_df,'Diseny+')
For Hulu¶
In [26]:
making_IMDbGraph(Hulu_df,'Hulu')

Count of Runtime of Movies¶

In [27]:
RuntimeCount=pd.DataFrame(dict(df.Runtime.value_counts().sort_values(ascending=False)[:10]).items(),
                          columns=['Runtime','Counts'])
RuntimeCount
Out[27]:
Runtime Counts
0 90.0 971
1 95.0 489
2 92.0 434
3 93.0 422
4 85.0 408
... ... ...
152 19.0 8
153 32.0 8
154 9.0 8
155 7.0 8
156 10.0 8

157 rows × 2 columns

In [28]:
fig = px.bar(RuntimeCount, 
             x='Runtime', 
             y='Counts',
             title="Count Of Runtimes Of Movies",
             text=RuntimeCount['Runtime'], 
             height=600)
fig.update_traces(marker_color='purple',texttemplate='%{text:.2s}', textposition='outside')
fig.show()
#HTML(fig.to_html())

Name of Directors and No. of movies directed by them¶

In [29]:
df.Directors.value_counts()
Out[29]:
Jay Chapman               36
Joseph Kane               30
Cheh Chang                26
Sam Newfield              22
Jim Wynorski              22
                          ..
Richard Ciupka             1
Ric Esther Bienstock       1
Ben Browder                1
Anocha Suwichakornpong     1
Richard Slater-Jones       1
Name: Directors, Length: 11338, dtype: int64
In [30]:
DirCount=pd.DataFrame(dict(df.Directors.value_counts()).items(),
                          columns=['Director','No. Of Movies'])
DirCount.sort_values(by='No. Of Movies',ascending=False,inplace=True)
DirCount=DirCount.head(20)
DirCount
Out[30]:
Director No. Of Movies
0 Jay Chapman 36
1 Joseph Kane 30
2 Cheh Chang 26
3 Sam Newfield 22
4 Jim Wynorski 22
5 David DeCoteau 21
6 William Beaudine 21
7 Jay Karas 20
8 Raúl Campos,Jan Suter 20
9 Marcus Raboy 18
10 Fred Olen Ray 17
11 William Witney 17
12 Scott L. Montoya 17
13 Lesley Selander 16
14 Mark Atkins 16
15 Paul Hoen 15
16 William Nigh 14
20 Philip Gardiner 13
19 Manny Rodriguez 13
17 Robert N. Bradbury 13
In [31]:
fig = px.bar(DirCount, 
             x=DirCount['Director'], 
             y=DirCount['No. Of Movies'],
             title="Directors And The Count Of Movies They Have Directed",
             text=DirCount['No. Of Movies'],
             height=600)
fig.update_traces(textposition='outside')
fig.show()
#HTML(fig.to_html())

Movies Directed By Director¶

In [32]:
def movieDirectedBy(df:pd.DataFrame,name:str):
    dfn=df['Joseph Kane' == df.Directors]
    dfn.fillna('null',inplace=True)
    fig = px.bar(dfn, 
             y=dfn['IMDb'], 
             x=dfn['Title'],
             title=f"Movies Directed By {name}",
             text=dfn['Genres'],
             height=600)
    fig.update_traces(marker_color='salmon',textfont_size=10,textposition='inside')
    fig.show()
    #return HTML(fig.to_html())
In [33]:
movieDirectedBy(df,'Joseph Kane')
C:\Users\siddh\AppData\Local\Temp\ipykernel_6444\4066340104.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

No. Of Movies Of Different Genres¶

In [34]:
genres=dict(df.Genres.value_counts())
gen=[]
for i in genres.keys():
    i=i.split(',')
    for j in i:
        gen.append(j.strip())
genres_df=pd.DataFrame(dict(pd.Series(gen).value_counts()).items(),
                       columns=['Genres','No. Of Movies'])
genres_df
Out[34]:
Genres No. Of Movies
0 Drama 868
1 Comedy 654
2 Adventure 560
3 Action 553
4 Thriller 467
5 Family 426
6 Romance 420
7 Fantasy 371
8 Crime 347
9 Mystery 318
10 Sci-Fi 312
11 Horror 296
12 Animation 265
13 Documentary 249
14 History 198
15 Biography 190
16 Music 171
17 Musical 171
18 War 170
19 Western 168
20 Short 141
21 Sport 126
22 News 36
23 Film-Noir 25
24 Reality-TV 8
25 Talk-Show 8
26 Game-Show 6
In [35]:
fig=px.bar(genres_df,
           x=genres_df.Genres,
           y=genres_df['No. Of Movies'],
           title='Movies In Different Genres',
           text=genres_df['No. Of Movies'],
           height=600)
fig.update_traces(marker_color='brown',textfont_size=10,textposition='outside')
fig.show()
#HTML(fig.to_html())

Top Movies¶

In [36]:
def topMoviesIn(df:pd.DataFrame,stream:str,over:float=8.5):
    color=random.choice(colors)
    data= df[df['IMDb']>over]
    data= data[['Title', 'IMDb', 'Genres']].sort_values(ascending=False, by='IMDb')
    fig=px.bar(data,
           x=data.Title,
           y=data.IMDb,
           title=f'Top Movies in {stream}',
           text=data.Genres,
           height=600)
    fig.update_traces(marker_color=color,textposition='inside')
    fig.show()
    #return HTML(fig.to_html())
On Netflix¶
In [37]:
topMoviesIn(netflix_df,'Netflix')
On Amazon Prime¶
In [38]:
topMoviesIn(prime_df,'Amazon Prime',8.8)
On Disney+¶
In [39]:
topMoviesIn(Disney_df,'Disney+',8)
On Hulu¶
In [40]:
topMoviesIn(Hulu_df,'Hulu',8)

Movies Before 1990¶

In [41]:
def MoviesBefore(df:pd.DataFrame,stream:str,before:int=1990):
    avg=df.Runtime.mean()
    df=df[df.Year.astype(int)<before].nlargest(20,'IMDb','first')
    color=random.choice(colors)
    fig=px.bar(df,
           y=df.Title,
           x=df.Year,
           title=f'Movies Before 1990 On {stream} Stream',
           text=df.Year,
           height=600)
    fig.update_traces(marker_color=color,textposition='inside')
    fig.show()
    #return HTML(fig.to_html())
    
On All Streams¶
In [42]:
MoviesBefore(df,'All')
On Netflix¶
In [43]:
MoviesBefore(netflix_df,'Netflix')
On Amazon Prime¶
In [44]:
MoviesBefore(prime_df,'Amazon Prime')
On Disney+¶
In [45]:
MoviesBefore(Disney_df,'Disney+')
On Hulu¶
In [46]:
MoviesBefore(Hulu_df,'Hulu')

Average ScreanTime¶

In [47]:
netflix_avg = netflix_df.Runtime.mean()
prime_avg = prime_df.Runtime.mean()
Disney_avg = Disney_df.Runtime.mean()
Hulu_avg = Hulu_df.Runtime.mean()
avg = [df.Runtime.mean(),netflix_avg,prime_avg,Disney_avg,Hulu_avg]
AvgDf=pd.DataFrame({'Streaming Service':'Overall Netflix Amazon Disney+ Hulu'.split(' '),'Screen Time':avg})
AvgDf
Out[47]:
Streaming Service Screen Time
0 Overall 93.413447
1 Netflix 98.912900
2 Amazon 92.293980
3 Disney+ 90.425225
4 Hulu 97.396610
In [48]:
fig = px.bar(AvgDf,
             y=AvgDf['Streaming Service'],
             x=AvgDf['Screen Time'],
             title='Screen Time On Each Stream',
             text=AvgDf['Screen Time'],
             height=600)
fig.update_traces(texttemplate='%{text:.4s} mins',textposition='inside')
fig.show()
#HTML(fig.to_html())
In [49]:
!jupyter nbconvert --to html TopMovieStreaming.ipynb
[NbConvertApp] Converting notebook TopMovieStreaming.ipynb to html
[NbConvertApp] Writing 82041920 bytes to TopMovieStreaming.html